Code
library(readxl)
sheets = excel_sheets("all_data.xlsx")
print(sheets)[1] "data_dictionary" "experiment1" "experiment2" "experiment3"
[5] "pooled123"
Source, structure, reclassification, limitation of data collection, assumptions, define meaning of key variables, describe any assumption (idea about the data), data cleaning.
library(readxl)
sheets = excel_sheets("all_data.xlsx")
print(sheets)[1] "data_dictionary" "experiment1" "experiment2" "experiment3"
[5] "pooled123"
all_data = read_excel("all_data.xlsx", sheet = sheets[5])
head(all_data)# A tibble: 6 × 82
Date PID BSSQ_1 BSSQ_2 BSSQ_3 BSSQ_4 BSSQ_5 BSSQ_6 BSSQ_7
<dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2021-06-30 16:55:00 1201 0 0 0 0 0 0 0
2 2021-06-30 17:46:00 1351 4 2 0 1 4 0 0
3 2021-06-30 18:25:00 2351 1 7 0 0 2 0 0
4 2021-06-30 18:59:00 1151 0 0 0 0 0 0 0
5 2021-06-30 20:25:00 2201 0 0 0 0 0 0 0
6 2021-06-30 21:05:00 1251 2 5 1 0 4 0 0
# ℹ 73 more variables: BSSQ_8 <dbl>, BSSQ_9 <dbl>, BSSQ_10 <dbl>,
# BSSQ_11 <dbl>, BSSQ_12 <dbl>, BSSQ_13 <dbl>, BSSQ_14 <dbl>, BSSQ_15 <dbl>,
# BSSQ_16 <dbl>, BSTAI_1 <chr>, BSTAI_2 <chr>, BSTAI_3 <chr>, BSTAI_4 <chr>,
# BSTAI_5 <chr>, BSTAI_6 <chr>, BE <dbl>, BVRA <dbl>, choice_env <chr>,
# no_choice_env <chr>, PSTAI_1 <chr>, PSTAI_2 <chr>, PSTAI_3 <chr>,
# PSTAI_4 <chr>, PSTAI_5 <chr>, PSTAI_6 <chr>, PE <dbl>, PVRA <dbl>,
# ASSQ_1 <dbl>, ASSQ_2 <dbl>, ASSQ_3 <dbl>, ASSQ_4 <dbl>, ASSQ_5 <dbl>, …
library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
data = all_data %>% select(age, gender, env, choice, BSSQ_5, ASSQ_5)
head(data)# A tibble: 6 × 6
age gender env choice BSSQ_5 ASSQ_5
<dbl> <chr> <chr> <chr> <dbl> <dbl>
1 30 Male Snowy CHOICE 0 2
2 37 Female Snowy CHOICE 4 3
3 28 Female Snowy NO_CHOICE 2 1
4 39 Female Snowy CHOICE 0 0
5 37 Male Snowy NO_CHOICE 0 0
6 18 Female Snowy CHOICE 4 0
mutated_data = mutate(data, focus_lvl_diff = ASSQ_5 - BSSQ_5)
head(mutated_data)# A tibble: 6 × 7
age gender env choice BSSQ_5 ASSQ_5 focus_lvl_diff
<dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl>
1 30 Male Snowy CHOICE 0 2 2
2 37 Female Snowy CHOICE 4 3 -1
3 28 Female Snowy NO_CHOICE 2 1 -1
4 39 Female Snowy CHOICE 0 0 0
5 37 Male Snowy NO_CHOICE 0 0 0
6 18 Female Snowy CHOICE 4 0 -4
filtered_data <- filter(mutated_data, gender != 'Other')
head(filtered_data)# A tibble: 6 × 7
age gender env choice BSSQ_5 ASSQ_5 focus_lvl_diff
<dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl>
1 30 Male Snowy CHOICE 0 2 2
2 37 Female Snowy CHOICE 4 3 -1
3 28 Female Snowy NO_CHOICE 2 1 -1
4 39 Female Snowy CHOICE 0 0 0
5 37 Male Snowy NO_CHOICE 0 0 0
6 18 Female Snowy CHOICE 4 0 -4
cleaning: data row yang gender other remove
library(tidyverse)── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ forcats 1.0.0 ✔ readr 2.1.5
✔ ggplot2 3.5.1 ✔ stringr 1.5.1
✔ lubridate 1.9.4 ✔ tibble 3.2.1
✔ purrr 1.0.4 ✔ tidyr 1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
ggplot(filtered_data, aes(x=age, y=focus_lvl_diff)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
labs(x="Age",
y="Difference in Focus Level",
title="Age vs ∆ Focus") `geom_smooth()` using formula = 'y ~ x'
library(ggplot2)
library(plotly)
Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':
last_plot
The following object is masked from 'package:stats':
filter
The following object is masked from 'package:graphics':
layout
p = ggplot(filtered_data, aes(x=age, y=focus_lvl_diff, colour = gender)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
labs(x="Age",
y="Difference in Focus Level",
title="Age vs ∆ Focus")
ggplotly(p)`geom_smooth()` using formula = 'y ~ x'
p = ggplot(mutated_data, aes(x=age, y=focus_lvl_diff, colour = env)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
labs(x="Age",
y="Difference in Focus Level",
title="Age vs ∆ Focus")
ggplotly(p)`geom_smooth()` using formula = 'y ~ x'
ggplot(mutated_data, aes(x=age, y=focus_lvl_diff, colour = choice)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
labs(x="Age",
y="Difference in Focus Level",
title="Age vs ∆ Focus") `geom_smooth()` using formula = 'y ~ x'
library(ggplot2)
# Fitting the model
model = lm(focus_lvl_diff ~ age, data = mutated_data)
# Create a residual plot
ggplot(model, aes(x = .fitted, y = .resid)) +
geom_point() +
geom_hline(yintercept = 0, linetype = "dashed", colour = "red") +
labs(title = "Residuals vs. Fitted Values")filtered_snowy = filter(mutated_data, env == 'Snowy')
filtered_sunny = filter(mutated_data, env == 'Sunny')model_snowy = lm(focus_lvl_diff ~ age, data = filtered_snowy)
# Create a residual plot
ggplot(model_snowy, aes(x = .fitted, y = .resid)) +
geom_point() +
geom_hline(yintercept = 0, linetype = "dashed", colour = "red") +
labs(title = "Residuals vs. Fitted Values")model_sunny = lm(focus_lvl_diff ~ age, data = filtered_sunny)
# Create a residual plot
ggplot(model_sunny, aes(x = .fitted, y = .resid)) +
geom_point() +
geom_hline(yintercept = 0, linetype = "dashed", colour = "red") +
labs(title = "Residuals vs. Fitted Values")how to plot the residual for both models? should we make two different tables?
2 articles max 50 words
library(readxl)
sheets = excel_sheets("all_data.xlsx")
print(sheets)[1] "data_dictionary" "experiment1" "experiment2" "experiment3"
[5] "pooled123"
all_data = read_excel("all_data.xlsx", sheet = sheets[5])
head(all_data)# A tibble: 6 × 82
Date PID BSSQ_1 BSSQ_2 BSSQ_3 BSSQ_4 BSSQ_5 BSSQ_6 BSSQ_7
<dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2021-06-30 16:55:00 1201 0 0 0 0 0 0 0
2 2021-06-30 17:46:00 1351 4 2 0 1 4 0 0
3 2021-06-30 18:25:00 2351 1 7 0 0 2 0 0
4 2021-06-30 18:59:00 1151 0 0 0 0 0 0 0
5 2021-06-30 20:25:00 2201 0 0 0 0 0 0 0
6 2021-06-30 21:05:00 1251 2 5 1 0 4 0 0
# ℹ 73 more variables: BSSQ_8 <dbl>, BSSQ_9 <dbl>, BSSQ_10 <dbl>,
# BSSQ_11 <dbl>, BSSQ_12 <dbl>, BSSQ_13 <dbl>, BSSQ_14 <dbl>, BSSQ_15 <dbl>,
# BSSQ_16 <dbl>, BSTAI_1 <chr>, BSTAI_2 <chr>, BSTAI_3 <chr>, BSTAI_4 <chr>,
# BSTAI_5 <chr>, BSTAI_6 <chr>, BE <dbl>, BVRA <dbl>, choice_env <chr>,
# no_choice_env <chr>, PSTAI_1 <chr>, PSTAI_2 <chr>, PSTAI_3 <chr>,
# PSTAI_4 <chr>, PSTAI_5 <chr>, PSTAI_6 <chr>, PE <dbl>, PVRA <dbl>,
# ASSQ_1 <dbl>, ASSQ_2 <dbl>, ASSQ_3 <dbl>, ASSQ_4 <dbl>, ASSQ_5 <dbl>, …
library(dplyr)
data = all_data %>% select(age, BSSQ_5, ASSQ_5)
head(data)# A tibble: 6 × 3
age BSSQ_5 ASSQ_5
<dbl> <dbl> <dbl>
1 30 0 2
2 37 4 3
3 28 2 1
4 39 0 0
5 37 0 0
6 18 4 0
mutated_data = mutate(data, focus_lvl_diff = ASSQ_5 - BSSQ_5)
head(mutated_data)# A tibble: 6 × 4
age BSSQ_5 ASSQ_5 focus_lvl_diff
<dbl> <dbl> <dbl> <dbl>
1 30 0 2 2
2 37 4 3 -1
3 28 2 1 -1
4 39 0 0 0
5 37 0 0 0
6 18 4 0 -4
very important No
library(tidyverse)
# ggplot(mutated_data, aes(x=age, y=focus_lvl_diff, colour = gender)) +
#geom_point() +
#geom_smooth(method = "lm", se = FALSE) +
#labs(x="Age",
# y="Difference in Focus Level",
# title="Age vs ∆ Focus") library(ggplot2)
# Fitting the model
model = lm(focus_lvl_diff ~ age, data = mutated_data)
# Create a residual plot
ggplot(model, aes(x = .fitted, y = .resid)) +
geom_point() +
geom_hline(yintercept = 0, linetype = "dashed", colour = "red") +
labs(title = "Residuals vs. Fitted Values")ggplot(mutated_data, aes(x=age)) + geom_histogram()`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(mutated_data, aes(x=focus_lvl_diff)) + geom_bar()just say that lm isn’t the perfect fit for the data. Dive deeper to the data, look into journals abt the correlation you’re researching.
how the effect of using vr